Class 6 Multilevel Model
Data Import
(you don’t actually need to run this)
library(WDI)
library(tidyverse)
library(countrycode)
indicators <- list('gini' = 'SI.POV.GINI',
'wlfp' = 'SL.TLF.ACTI.ZS' ,# Labor Force Participation Rate (%), Female
'gdp_pc_ppp' = "NY.GDP.PCAP.PP.KD",# "GDP per capita, PPP (constant 2005 international $)"
'resource_rents' = 'NY.GDP.TOTL.RT.ZS'
)
wdi <- WDI(indicator=indicators,
start = 2020,
end = 2023
)
wdi$country_id<- countrycode(wdi$iso3c, origin='iso3c', destination ='vdem')
wdi_filled<-wdi|>
group_by(country)|>
arrange(year)|>
fill(c(resource_rents, gini, wlfp), .direction='downup')|>
slice_tail(n=1)
polregions<-c('Eastern Europe', # (including German Democratic Republic, excluding the Caucasus)
'Latin America and the Caribbean',
'The Middle East and North Africa', # (including Israel and Türkiye, excluding Cyprus)
'Sub-Saharan Africa',
'Western Europe and North America', # (including Cyprus, but excluding German Democratic Republic)
'East Asia and the Pacific',
'South and Central Asia' #(including the Caucasus)
)
# Electoral system data
elsystem<-vdemdata::vdem|>
arrange(year)|>
filter(year>=2010)|>
group_by(country_name)|>
fill(v2elparlel, .direction='down')|>
slice_tail(n=1)|>
select(country_name,v2elparlel)
vdem<-vdemdata::vdem|>
arrange(year)|>
filter(v2x_elecreg == 1)|>
filter(year>=2020)|>
group_by(country_name)|>
select(starts_with('country'), v2lgfemleg, v2x_elecreg, v2lgqugen, v2lgqugent, e_regionpol_7C,
e_fh_pr, e_fh_cl,e_fh_status)|>
fill(everything(),.direction='down')|>
slice_tail(n=1)|>
ungroup()|>
left_join(wdi_filled, by=join_by(country_id == country_id))|>
select(-starts_with("iso"), -country_id, -country, -year)|>
left_join(elsystem, by='country_name')|>
mutate(region = factor(e_regionpol_7C, labels=polregions),
log_gdp_pc_ppp = log(gdp_pc_ppp),
fh_status = factor(e_fh_status, levels=c(1,2, 3), labels=c('Free', "Partly Free", "Not Free")),
fh_status = fct_relevel(fh_status, "Not Free", "Partly Free", "Free"),
free_dummy = factor(fh_status == "Free", labels=c("Not Free", "Free")),
majoritarian_dummy = factor(v2elparlel ==0, labels=c("Not Majoritarian", "Majoritarian"))
)
vdem<-set_variable_labels(vdem,
'v2lgfemleg' = "Lower chamber % female legislators",
'v2x_elecreg' = "",
'v2lgqugen' = 'Lower chamber gender quota',
'v2lgqugent' = 'Lower chamber gender quota threshold',
'resource_rents' = 'Total natural resources rents (% of GDP)',
'gini'= 'Gini coefficient',
'wlfp' = "Female labor force participation",
'log_gdp_pc_ppp' = 'Logged GDP per capita, PPP',
'region' ='Region (politico-geographic 7-category)',
'majoritarian_dummy' = 'Lower chamber electoral system'
)
vdem_cleaned<-vdem|>
drop_na(v2lgfemleg, v2lgqugen , wlfp , resource_rents , log_gdp_pc_ppp, e_fh_pr, e_fh_cl)
#saveRDS(vdem_cleaned,file='vdem_cleaned.rds')if(!file.exists('vdem_clean.rds')){
download.file('https://github.com/Neilblund/GVPT728_Winter24/raw/refs/heads/main/Additional%20Code%20and%20Data/vdem_cleaned.rds',
'vdem_clean.rds',
mode='wb')
}
vdem_cleaned<-readRDS('vdem_clean.rds')Describing the main relationship
In general, do countries with majoritarian electoral systems elect fewer women?
relationship <- ggplot(vdem_cleaned, aes(x = majoritarian_dummy, y = v2lgfemleg)) +
geom_half_point(aes(color = majoritarian_dummy),
transformation = position_quasirandom(width = 0.1),
side = "l", size = 1, alpha = 0.5) +
geom_half_boxplot(aes(fill = majoritarian_dummy), side = "r") +
guides(color = "none", fill = "none") +
labs(x = "Lower House Electoral System", y = "% women in legislature") +
theme_bw() +
scale_fill_brewer(palette='Dark2') +
scale_color_brewer(palette='Dark2')
relationshipHowever: might want to control for some additional factors that could influence this relationship.
Women’s Labor Force Participation
logged GDP
Whether a country has quotas guaranteeing a certain number of seats for women in the legislature.
data<-vdem_cleaned|>
select(v2lgfemleg, wlfp, log_gdp_pc_ppp, v2lgqugen, majoritarian_dummy)
GGally::ggpairs(data, aes(color=majoritarian_dummy)) +
theme_bw() +
scale_color_brewer(palette='Dark2') +
scale_fill_brewer(palette='Dark2') Using OLS
model_0<-lm(v2lgfemleg ~
majoritarian_dummy +
wlfp +
log_gdp_pc_ppp +
v2lgqugen,
data=vdem_cleaned)
modelsummary(list("OLS" = model_0),
coef_rename=TRUE,
coef_omit = "Intercept",
estimate = "{estimate}",
statistic = c("conf.int"),
conf_level = .95,
note = "95% CI in brackets",
gof_omit = 'F|RMSE|R2$|AIC|Log.Lik.',
)| OLS | |
|---|---|
| 95% CI in brackets | |
| Lower chamber electoral system [Majoritarian] | -6.195 |
| [-10.102, -2.289] | |
| Female labor force participation | 0.304 |
| [0.132, 0.477] | |
| Logged GDP per capita, PPP | 1.316 |
| [-0.288, 2.921] | |
| Lower chamber gender quota | 2.332 |
| [1.140, 3.525] | |
| Num.Obs. | 157 |
| R2 Adj. | 0.229 |
| BIC | 1219.2 |
The effect of majoritarianism (relative to non-majoritarian electoral systems) is a estimated to be around a 6% decrease in the % women in the legislature. The presence of quotas, unsurprisingly, also has some impact, as does female labor force participation. While GDP may also play a role, the 95% confidence interval contains zero and is not statistically significant at conventional levels.
Fixed Effects
We might be concerned that this model fails to capture important variation. For instance: maybe cultural norms, colonial legacies, or simple geography play a major role in determining the outcome of interest?
ggplot(vdem_cleaned, aes(x=wlfp, y=v2lgfemleg)) +
geom_point(aes(color=majoritarian_dummy)) +
theme_bw() +
scale_color_brewer(palette='Dark2') +
scale_fill_brewer(palette='Dark2') +
labs(x="Women's Labor Force Participation",
y="% women in the legislature",
color = 'Electoral System'
) +
facet_wrap(~region)While we don’t have direct measures of a lot of these characteristics, we might try to account for them by including k-1 region-level dummy variables in the model. This would be the a fixed effects model:
model_1<-feols(v2lgfemleg ~
majoritarian_dummy +
wlfp +
log_gdp_pc_ppp +
v2lgqugen | region,
data=vdem_cleaned,
cluster ~ region)
model_1OLS estimation, Dep. Var.: v2lgfemleg
Observations: 157
Fixed-effects: region: 7
Standard-errors: Clustered (region)
Estimate Std. Error t value Pr(>|t|)
majoritarian_dummyMajoritarian -5.449041 2.179653 -2.49996 0.0465309 *
wlfp 0.195182 0.082976 2.35227 0.0568819 .
log_gdp_pc_ppp 1.735662 1.109650 1.56415 0.1688154
v2lgqugen 2.129287 0.529949 4.01791 0.0069754 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
RMSE: 9.90766 Adj. R2: 0.307379
Within R2: 0.175529
modelsummary(list("OLS" = model_0, "Region FE" = model_1),
coef_rename=TRUE,
coef_omit = "Intercept",
estimate = "{estimate}",
statistic = c("conf.int"),
conf_level = .95,
note = "95% CI in brackets",
gof_omit = 'F|RMSE|R2$|AIC|Log.Lik.',
)| OLS | Region FE | |
|---|---|---|
| 95% CI in brackets | ||
| Lower chamber electoral system [Majoritarian] | -6.195 | -5.449 |
| [-10.102, -2.289] | [-10.782, -0.116] | |
| Female labor force participation | 0.304 | 0.195 |
| [0.132, 0.477] | [-0.008, 0.398] | |
| Logged GDP per capita, PPP | 1.316 | 1.736 |
| [-0.288, 2.921] | [-0.980, 4.451] | |
| Lower chamber gender quota | 2.332 | 2.129 |
| [1.140, 3.525] | [0.833, 3.426] | |
| Num.Obs. | 157 | 157 |
| R2 Adj. | 0.229 | 0.307 |
| R2 Within | 0.176 | |
| R2 Within Adj. | 0.153 | |
| BIC | 1219.2 | 1221.3 |
| Std.Errors | by: region | |
Notably, the inclusion of fixed effects here doesn’t drastically alter our estimates, and we still find that majoritarian systems have about the same expected negative effect on the dependent variable that they did in our original linear model.
While the fixed effects terms are automatically left out of the model output here, we can view them by running:
fixef(model_1)$region
East Asia and the Pacific Eastern Europe
-9.1597424 -6.6165617
Latin America and the Caribbean South and Central Asia
-1.8319281 -7.5443107
Sub-Saharan Africa The Middle East and North Africa
-2.8207371 -13.3610522
Western Europe and North America
0.9046372
attr(,"class")
[1] "fixest.fixef" "list"
attr(,"exponential")
[1] FALSE
We can think of these as the separate Y-intercept terms for each region in our data set.
fixef_slopes<-predict_response(model_1, terms=c('majoritarian_dummy','region'))|>
plot(ci=FALSE, connect_lines=TRUE, show_data=TRUE, jitter=TRUE) +
ggtitle("Fixed Effects") +
scale_color_brewer(palette='Dark2')
fixef_slopesMultilevel Model
While both the standard OLS model and the fixed effects model seem to work reasonably well here, there may be cases where a multilevel model is more appropriate. This could be for several reasons. For instance:
We might think that the fixed effect for some small regions are implausibly low or high. After all: if a region only has a handful of countries, we might find that region has a really high or really low value of the dependent variable by random chance. Since random effects pool estimates across levels of the random effect, a random effects model may be a better fit.
We might want to investigate a characteristic that doesn’t vary much or at all within regions. For instance: we might have a variable for colonial legacy, primary religion, or membership in a regional organization. Since some of these will vary very little within each region, these controls would be perfectly or near-perfectly co-linear with the fixed effects estimator.
To run a linear random effects model, we’ll use the lmer function and include a random effect term. The lme4 package provides a whole bunch of ways to specify complex hierarchical models, but if we have a single random effect term, this is pretty straightforward: we just add something like:
+ (1|random_effect_term)
…to our formula. So here’s how I can create a region-level random effect:
model_2<-lmer(v2lgfemleg ~
majoritarian_dummy +
wlfp +
log_gdp_pc_ppp +
v2lgqugen +
(1|region), data=vdem_cleaned)We’ll take a look at the summary output:
summary(model_2)Linear mixed model fit by REML ['lmerMod']
Formula: v2lgfemleg ~ majoritarian_dummy + wlfp + log_gdp_pc_ppp + v2lgqugen +
(1 | region)
Data: vdem_cleaned
REML criterion at convergence: 1176.9
Scaled residuals:
Min 1Q Median 3Q Max
-1.88705 -0.66728 0.04488 0.64984 2.88985
Random effects:
Groups Name Variance Std.Dev.
region (Intercept) 17.33 4.163
Residual 105.53 10.273
Number of obs: 157, groups: region, 7
Fixed effects:
Estimate Std. Error t value
(Intercept) -7.60650 10.82151 -0.703
majoritarian_dummyMajoritarian -5.74595 1.92982 -2.977
wlfp 0.22878 0.08766 2.610
log_gdp_pc_ppp 1.71625 1.04148 1.648
v2lgqugen 2.16646 0.58354 3.713
Correlation of Fixed Effects:
(Intr) mjrt_M wlfp lg_g__
mjrtrn_dmmM -0.095
wlfp -0.328 -0.066
lg_gdp_pc_p -0.816 0.061 -0.245
v2lgqugen -0.268 0.260 0.156 0.100
The random effects terms can be accessed separately. These values should correspond to the average difference between the group-specific intercept vs. the overall intercept term:
ranef(model_2)$region
(Intercept)
Eastern Europe -0.9922269
Latin America and the Caribbean 2.8991079
The Middle East and North Africa -5.2553624
Sub-Saharan Africa 2.5144694
Western Europe and North America 4.8133411
East Asia and the Pacific -2.7656509
South and Central Asia -1.2136781
with conditional variances for "region"
Note that we don’t automatically get standard errors for these estimates. There are a number of methods for getting confidence intervals around a random effect. The estimate_grouplevels from the modelbased package is an easy one:
estimate_grouplevel(model_2)How important are the random effects for our model? One way to quantify this is using the intra-class correlation coefficient, which can be interpreted as the proportion of the total variance attributable to the grouping variable:
performance::icc(model_2)Now we can summarize our results in a single table:
model_list<-list("OLS" = model_0, "Region FE" = model_1, "Region RE" =model_2)
modelsummary(model_list,
coef_rename=TRUE,
coef_omit = "Intercept",
estimate = "{estimate}",
statistic = c("conf.int"),
conf_level = .95,
note = "95% CI in brackets",
gof_omit = 'F|RMSE|R2$|AIC|Log.Lik.|R2 Marg|R2 Cond|R2 Within',
)| OLS | Region FE | Region RE | |
|---|---|---|---|
| 95% CI in brackets | |||
| Lower chamber electoral system [Majoritarian] | -6.195 | -5.449 | -5.746 |
| [-10.102, -2.289] | [-10.782, -0.116] | [-9.559, -1.933] | |
| Female labor force participation | 0.304 | 0.195 | 0.229 |
| [0.132, 0.477] | [-0.008, 0.398] | [0.056, 0.402] | |
| Logged GDP per capita, PPP | 1.316 | 1.736 | 1.716 |
| [-0.288, 2.921] | [-0.980, 4.451] | [-0.342, 3.774] | |
| Lower chamber gender quota | 2.332 | 2.129 | 2.166 |
| [1.140, 3.525] | [0.833, 3.426] | [1.013, 3.319] | |
| SD (Observations) | 10.273 | ||
| Num.Obs. | 157 | 157 | 157 |
| R2 Adj. | 0.229 | 0.307 | |
| BIC | 1219.2 | 1221.3 | 1212.3 |
| ICC | 0.1 | ||
| Std.Errors | by: region | ||
And/Or plot the coefficients
modelplot(model_list,
coef_rename=TRUE,
coef_omit = c(1)
) +
scale_color_brewer(palette='Dark2') +
geom_vline(xintercept=0, lty=2)In contrast to the fixed effects model, which allows each region to have a totally separate Y-intercept, the random effects model “pools” the estimated intercepts across each group and shrinks them toward the global mean. The result is that the differences in the intercept terms are less dramatic, especially for smaller groups where there’s less data.
raneff_slopes<-predict_response(model_2, terms=c('majoritarian_dummy','region'),
margin='empirical')|>
plot( ci=FALSE, show_data=TRUE, connect_lines=TRUE, jitter=TRUE) +
ggtitle("Region Random Effects") +
scale_color_brewer(palette='Dark2')
fixef_slopes / raneff_slopes +
plot_layout(guides = "collect")